{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8c7beac3-9c70-4e59-ba6a-292fa4e130a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sec_parser as sp\n",
    "from sec_parser.semantic_elements.table_element.table_parser import TableParser\n",
    "import os,re, sys, random\n",
    "import pandas as pd\n",
    "import warnings, time\n",
    "from collections import deque\n",
    "from concurrent.futures import ProcessPoolExecutor, TimeoutError, as_completed\n",
    "from tqdm import tqdm\n",
    "from fuzzywuzzy import fuzz\n",
    "from datetime import datetime\n",
    "\n",
    "def log_print(message):\n",
    "    with open(\"inv_logfile.log\", \"a\") as log_file:\n",
    "        print(message, file=log_file)  # writes message to the log file\n",
    "num_cpus = os.cpu_count()\n",
    "warnings.filterwarnings('ignore')\n",
    "sys.setrecursionlimit(10_000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d94e4ad9-199c-4f05-9f4d-11566a01534b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from difflib import SequenceMatcher\n",
    "\n",
    "def partial_ratio(s1, s2):\n",
    "    \"\"\"Return the ratio of the most similar substring\n",
    "    as a number between 0 and 100.\"\"\"\n",
    "\n",
    "    shorter = s1\n",
    "    longer = s2\n",
    "\n",
    "    m = SequenceMatcher(None, shorter, longer, autojunk=False)\n",
    "    blocks = m.get_matching_blocks()\n",
    "\n",
    "    # each block represents a sequence of matching characters in a string\n",
    "    # of the form (idx_1, idx_2, len)\n",
    "    # the best partial match will block align with at least one of those blocks\n",
    "    #   e.g. shorter = \"abcd\", longer = XXXbcdeEEE\n",
    "    #   block = (1,3,3)\n",
    "    #   best score === ratio(\"abcd\", \"Xbcd\")\n",
    "    scores = []\n",
    "    for (short_start, long_start, _) in blocks:\n",
    "        new_long_start = max(0, long_start - short_start)\n",
    "        new_long_end = new_long_start + len(shorter)\n",
    "        long_substr = longer[new_long_start:new_long_end]\n",
    "\n",
    "        m2 = SequenceMatcher(None, shorter, long_substr, autojunk=False)\n",
    "        r = m2.ratio()\n",
    "        if r > .995:\n",
    "            return 100\n",
    "        else:\n",
    "            scores.append(r)\n",
    "\n",
    "    return max(scores) * 100.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "afc4d959-a8bd-4349-b267-5db39d2cc55d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def apply_units(df, unit_multiplier):\n",
    "    share_keywords = ['share', 'basic', 'diluted']\n",
    "    # Iterate over each row in the DataFrame\n",
    "    for index, row in df.iterrows():\n",
    "        # Iterate over each column in the row\n",
    "        for col in df.columns:\n",
    "            # Check if the cell value can be converted to a numeric value\n",
    "            if df.at[index, col] is not None and df.at[index, col].isnumeric():\n",
    "                # Apply the unit multiplier to the numeric value\n",
    "                df.at[index, col] = int(df.at[index, col]) * unit_multiplier\n",
    "    return df\n",
    "\n",
    "def is_schedule(title, keywords):\n",
    "    if any(partial_ratio(kw, title) > 80 for kw in keywords):\n",
    "        return True\n",
    "    return False\n",
    "\n",
    "def match_units(text):\n",
    "    units_pattern = re.compile(r'thousand|million|billion', re.IGNORECASE)\n",
    "    multipliers = {'thousand': 1e3, 'million':1e6, 'billion':1e9}\n",
    "\n",
    "    units_match = re.search(units_pattern, text)\n",
    "    if units_match:\n",
    "        mult = multipliers.get(units_match.group(0).lower())\n",
    "        return mult if mult is not None else 1\n",
    "    return 1\n",
    "\n",
    "def is_non_empty_row(row):\n",
    "    # Remove newline characters inside <td> elements for cleaner matching\n",
    "    cleaned_row = re.sub(r'\\n', '', row)\n",
    "    # Check if the row contains any non-empty <td> elements\n",
    "    return bool(re.search(r'<td[^>]*>.*?\\S.*?</td>', cleaned_row, re.IGNORECASE))\n",
    "\n",
    "def find_table(tree, filing_year, keywords):\n",
    "    \"\"\"Perform a DFS to find tables with titles containing specified keywords and exclude tables mentioning the previous year.\"\"\"\n",
    "    previous_year = str(filing_year - 2)\n",
    "    stack = list(tree)\n",
    "    tables = []\n",
    "    found_title = False\n",
    "    \n",
    "    units_multiplier = 1\n",
    "\n",
    "    while stack:\n",
    "        node = stack.pop()\n",
    "                        \n",
    "        # If a title was found, look for the table element\n",
    "        # [Title Found Context]\n",
    "        if found_title:\n",
    "            if hasattr(node, 'semantic_element') and isinstance(node.semantic_element, sp.TableElement):\n",
    "                df = TableParser(node.semantic_element.get_source_code().replace(\"$\",\"\")).parse_as_df()\n",
    "                tables.append(df)\n",
    "                found_title = False  # Reset found_title after finding the table\n",
    "                # Note: should I continue here or keep searching for extension tables?\n",
    "                \n",
    "            # If you find a title or subtitle that includes a different date than the filing date, continue\n",
    "            elif hasattr(node, 'semantic_element') and isinstance(node.semantic_element, (sp.TitleElement, sp.SupplementaryText, sp.TextElement)):\n",
    "                if previous_year in node.text:\n",
    "                    continue\n",
    "                if units_multiplier == 1:\n",
    "                        units_multiplier = match_units(text)\n",
    "\n",
    "        # [Outside Title Found Context]\n",
    "        else:\n",
    "            # Check if the node is a title element\n",
    "            if hasattr(node, 'semantic_element') and isinstance(node.semantic_element, sp.TitleElement):\n",
    "                text = node.text.lower()\n",
    "                # If the title element is the title of the schedule of investments\n",
    "                if text and is_schedule(text, keywords):\n",
    "                    found_title = True\n",
    "                    if units_multiplier == 1:\n",
    "                        units_multiplier = match_units(text)\n",
    "            if hasattr(node, 'semantic_element') and isinstance(node.semantic_element, (sp.TextElement,sp.SupplementaryText)):\n",
    "               # Check if this text element contains only one of the keywords\n",
    "                stripped_text = re.sub(r'[^a-zA-Z\\s]', '', node.text).strip().lower()\n",
    "                if any(fuzz.ratio(kw, stripped_text) > 70 for kw in keywords):\n",
    "                    found_title = True\n",
    "                    if units_multiplier == 1:\n",
    "                        units_multiplier = match_units(text)\n",
    "            elif hasattr(node, 'semantic_element') and isinstance(node.semantic_element, sp.TableElement):\n",
    "                first_three_rows = node.semantic_element.get_source_code().lower().replace(\"$\", \"\").split(\"</tr>\")\n",
    "                # Filter out rows that are effectively empty\n",
    "                first_three_rows = [row for row in first_three_rows if is_non_empty_row(row)][:3]\n",
    "            \n",
    "                # Check if any cell in the first row satisfies the condition defined by is_schedule\n",
    "                if any(is_schedule(cell, keywords) for cell in first_three_rows):\n",
    "                    df = TableParser(node.semantic_element.get_source_code().replace(\"$\",\"\")).parse_as_df()\n",
    "                    tables.append(df)\n",
    "                    if units_multiplier == 1:\n",
    "                        units_multiplier = match_units(node.semantic_element.get_source_code().lower())\n",
    "                        \n",
    "        # Add all children to the stack\n",
    "        stack.extend(reversed(node.children))\n",
    "    \n",
    "    tables_dfs = [apply_units(df, units_multiplier) for df in tables] if units_multiplier > 1 else tables\n",
    "    return (tables_dfs, units_multiplier)\n",
    "\n",
    "def process_file(filing, i):\n",
    "    with open(filing, 'r') as f:\n",
    "        inv_keywords = [\n",
    "            \"schedule of investment\", \"statement of investment\", \n",
    "            \"schedules of investment\", \"statements of investment\",\n",
    "            \"investments at fair value\", \"investment portfolio\",\n",
    "            \"portfolio of investment\", \"portfolio securities\",\n",
    "            \"summary of investment\"\n",
    "        ]\n",
    "            \n",
    "        html_content = f.read().replace(\",\", \"\")\n",
    "                   \n",
    "        # Count the number of HTML tags\n",
    "        pattern = re.compile(r'<[^>]+>')\n",
    "        num_tags = len(pattern.findall(html_content))\n",
    "        \n",
    "        if num_tags < 400:\n",
    "            log_print(f\"[HTML ERROR]: The following filing is not formatted as a HTML file: {filing}\")\n",
    "            return 0 \n",
    "        \n",
    "        if not any(kw in html_content.lower() for kw in inv_keywords):\n",
    "            log_print(f\"[BDC ERROR]: The following filing does not contain a schedule of investments: {filing}\")\n",
    "            return 0\n",
    "        \n",
    "        if not any(partial_ratio(kw, html_content.lower()) > 70 for kw in inv_keywords):\n",
    "            log_print(f\"[BDC ERROR]: The following filing does not contain a schedule of investments: {filing}\")\n",
    "            return 0\n",
    "        \n",
    "        patterns = {\n",
    "            'company conformed name': r'company conformed name:\\s*(.*?)\\n',\n",
    "            'filed as of date': r'filed as of date:\\s*(.*?)\\n',\n",
    "            'conformed submission type': r'conformed submission type:\\s*(.*?)\\n',\n",
    "        }\n",
    "        results = {}\n",
    "        for field, pattern in patterns.items():\n",
    "            match = re.search(pattern, html_content, re.IGNORECASE)\n",
    "            results[field] = match.group(1).strip() if match else None\n",
    "\n",
    "        company_name = results.get('company conformed name')\n",
    "        company_name = company_name.replace(\"/\", \"-\").replace(\"\\\\\", \"-\") if company_name else \"Unknown Company\"\n",
    "        filing_date = results.get('filed as of date')\n",
    "        submission_type = results.get('conformed submission type')\n",
    "        year = int(filing_date[:4])\n",
    "        cik = os.path.basename(os.path.dirname(filing))\n",
    "        elements = sp.Edgar10QParser().parse(html_content)\n",
    "        tree = sp.TreeBuilder().build(elements)\n",
    "        \n",
    "        # Perform DFS to search for the table with the specified title in the entire tree\n",
    "        tables, units_multiplier = find_table(tree, year, inv_keywords)\n",
    "        \n",
    "        if len(tables) > 0:\n",
    "            log_print(f\"Schedule of Investments found for {filing}\")\n",
    "\n",
    "            for i,df in enumerate(tables):\n",
    "                filename = f\"{submission_type}_{company_name}_{cik}_{filing_date}_{i}_INV.csv\"\n",
    "                df.to_csv(os.path.join(\"output_inv_final\", filename))\n",
    "            return 0\n",
    "        else:\n",
    "            log_print(f\"[Tables Error] Schedule of Investment not found for {filing}\")\n",
    "\n",
    "        return 1\n",
    "\n",
    "def filter_files(parent_directory):\n",
    "    subdirectories = [os.path.join(parent_directory, d) for d in os.listdir(parent_directory) if os.path.isdir(os.path.join(parent_directory, d))]\n",
    "    total_files = []\n",
    "    \n",
    "    for subdir in subdirectories:\n",
    "        files_in_subdir = [\n",
    "            os.path.join(subdir, file) \n",
    "            for file in os.listdir(subdir) \n",
    "            if not (file.startswith(\"8-K\") or file.startswith(\"10-KA\") or file.startswith(\"NT\") or file.startswith(\"10-QA\"))\n",
    "        ]\n",
    "\n",
    "        # Create a dictionary to store the files for each year\n",
    "        files_by_year = {}\n",
    "        for file in files_in_subdir:\n",
    "            # Extract the year from the filename\n",
    "            match = re.search(r'_(\\d{4})-', file)\n",
    "            if match:\n",
    "                year = match.group(1)\n",
    "                if year not in files_by_year:\n",
    "                    files_by_year[year] = []\n",
    "                files_by_year[year].append(file)\n",
    "        \n",
    "        for year, files in files_by_year.items():\n",
    "            # Find 10-K files first\n",
    "            ten_k_files = [f for f in files if '10-K_' in f]\n",
    "            if ten_k_files:\n",
    "                total_files.append(ten_k_files[0])\n",
    "            else:\n",
    "                # If no 10-K files, find 10-Q files\n",
    "                ten_q_files = [f for f in files if '10-Q_' in f]\n",
    "                if ten_q_files:\n",
    "                    ten_q_files.sort(key=lambda x: datetime.strptime(re.search(r'_(\\d{4}-\\d{2}-\\d{2})_', x).group(1), '%Y-%m-%d'))\n",
    "                    total_files.append(ten_q_files[-1])\n",
    "    return total_files\n",
    "\n",
    "def main(parent_directory):\n",
    "    total_files = filter_files(parent_directory)\n",
    "    with tqdm(total=len(total_files), file=sys.stdout) as pbar:\n",
    "        with ProcessPoolExecutor(max_workers=num_cpus) as executor:\n",
    "            futures = {executor.submit(process_file, file, i): file for i,file in enumerate(total_files)}\n",
    "            for future in as_completed(futures, timeout=30):\n",
    "                try:\n",
    "                    status = future.result()\n",
    "                except Exception as e:\n",
    "                    log_print(f\"Error in future {futures[future]}: {e}\")\n",
    "                finally:\n",
    "                    pbar.update(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b51e8f22-1b5b-431c-9757-296da4518694",
   "metadata": {},
   "outputs": [
    {
     "ename": "",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31mRunning cells with 'Python 3.9.13' requires the ipykernel package.\n",
      "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n",
      "\u001b[1;31mCommand: 'c:/msys64/mingw64/bin/python.exe -m pip install ipykernel -U --user --force-reinstall'"
     ]
    }
   ],
   "source": [
    "if __name__ == \"__main__\":   \n",
    "    try:\n",
    "        os.remove(\"inv_logfile.log\")\n",
    "    except:\n",
    "        pass\n",
    "    \n",
    "    os.makedirs(\"output_inv_final\", exist_ok=True)\n",
    "\n",
    "    parent_directory = \"/BDC_Data\"\n",
    "    start_time = time.time()\n",
    "\n",
    "    main(parent_directory)\n",
    "\n",
    "    end_time = time.time()\n",
    "    runtime = end_time - start_time\n",
    "    print(f'The runtime of the program is: {runtime} seconds')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
